#load dependencies
require(png)
require(plyr)
require(dplyr)
require(osmdata)
require(Hmisc)
require(usedist)
require(leaflet)
require(sf)
require(geosphere)
require(doParallel)
require(foreach)
require(plotly)
require(knitr)
require(gplots)

Synopsis

Data

Features

The data have been downloaded from the online property website domain.com.au by a member of the online data science community kaggle.com, you can see the post here. Thank you to user anthonypino.

Quality Check

#----read in data
property_data <- read.csv('original_data/Melbourne_housing_FULL.csv')

#----rename columns
names(property_data) <- c('suburb','add','nrooms','type','price','method','agent','date',
              'precomputeddist','pc','nbedroom','nbathroom','ncar','land_area',
              'building_area','year_built','council_area','lat','lng','region',
              'propcount')

#----recast types
property_data$date = as.Date(property_data$date,format='%d/%m/%Y')
## Warning in strptime(x, format, tz = "GMT"): unable to identify current timezone 'C':
## please set environment variable 'TZ'
property_data$suburb = property_data$suburb %>% 
    as.character %>% capitalize %>% as.factor
property_data$add = as.character(property_data$add)
property_data$propcount = as.numeric(property_data$propcount)
property_data$precomputeddist = as.numeric(property_data$precomputeddist)

#----compute distance from city
MELBOURNE_CENTRE = c(144.9631,-37.8136)
locs = select(property_data,c(lng,lat))
property_data$dist_cbd = apply(locs,1,function(loc){distm(loc,MELBOURNE_CENTRE)})

#give every object an ID
property_data$ID = as.character(1:nrow(property_data))

After parsing and an ID column is appended to the data, the table has 34857 records of sales, each with 23 features. View the first 100 rows of the data here.

discuss the data in its default form

write here download the data yourself here

#parse government data
gov_median_data <- read.csv('other_data/suburb_house_2017_edited.csv',stringsAsFactors = F)
names(gov_median_data)[1] <- 'suburb'
gov_median_data$suburb <- gov_median_data$suburb %>% tolower %>% capitalize

#limit property_data to 2017
property_data_2017 <- subset(property_data,date<'2018-01-01' & date > '2016-12-31')
#limit to houses
property_data_2017 = property_data_2017[property_data_2017$type=='h',]


#select relevant columns
gov_median_data =  gov_median_data %>% select(c(suburb,X2017))
#rename column
names(gov_median_data)[2] <- 'median_price'
property_data_2017 <- group_by(property_data_2017,suburb) %>% 
    summarise(median_price = median(price,na.rm=T),count=n())

#record how many properties were sold in this area, compared to total
property_data_2017$prop <- property_data_2017$count/sum(property_data_2017$count)

comparison_data = merge(property_data_2017,gov_median_data,by='suburb',suffixes = c('_pd','_gov'))


plot_ly(data=comparison_data,
        x=~median_price_gov,
        y=~median_price_pd,
        type='scatter',
        mode='markers',
        text=paste0(comparison_data$suburb,'\n',comparison_data$count,' sales'),
        hoverinfo='text'
) %>% add_lines(x=c(0,4e+6),y=c(0,4e+6),inherit = F) %>%
    layout(showlegend=F,
           xaxis=list('title'='Median Sale Price in our Dataset'),
           yaxis=list('title'='Median'))

Objectives

Data Preparation

skip alot of this, or collapse not bold

Exploration and Cleaning

fig <- readPNG('na_heatmap.png')
plot.new()
rasterImage(fig,0,0,1,1)

#----------------------lnglat
leaflet(property_data) %>% addTiles %>% addCircles(lng=~lng,lat=~lat,
                                                   opacity = 0.5,
                                                   fillOpacity = 0.5,
                                                   color='#0078D7',
                                                   fillColor='#0078D7'
                                                   )
#lng and lat have very few missing, dont wanna bother imputing
loc_bool <- !is.na(property_data$lng) & !(is.na(property_data$lat))
#----------------------year_built
#1 property was apparaently built in 1196 and another next century
to_correct = which(property_data$year_built > 2030| property_data$year_built < 1788) 
property_data[to_correct,'year_built'] = NA
#--------------------ncar
ggplot(data= property_data, aes(x=ncar,y=log(price))) + 
    geom_jitter(alpha=0.2,color='#0078D7')

#limit the scope of analysis to properties with fewer than 7 car parks
ncar_bool <- is.na(property_data$ncar) | property_data$ncar<=6
#--------------------nbathroom
ggplot(property_data,aes(x=nbathroom,y=log(price))) + 
    geom_jitter(alpha=0.2,color='#0078D7')

#limit the analysis to properties with nbathroom <= 4
bath_bool <- !is.na(property_data$nbathroom) & (property_data$nbathroom<=4 & property_data$nbathroom>0)
#---------------------nrooms
ggplot(property_data,aes(x=nrooms,y=log(price))) + 
    geom_jitter(alpha=0.2,color='#0078D7')

#limit the analysis to properties with nrooms <=6
nroom_bool <- property_data$nrooms<=6 & property_data$nrooms>0
#---------------------nbedroom
sum(is.na(property_data$nbedroom))
## [1] 8217
has_bedroom = !is.na(property_data$nbedroom)
cor(property_data$nbedroom[has_bedroom],property_data$nrooms[has_bedroom])
## [1] 0.9467546
#dont include in analysis
#----------------------building_area
ggplot(property_data,aes(x=building_area,y=log(price))) + 
    geom_point(alpha=0.2,color='#0078D7')

#make ==0 NA
property_data[is.na(property_data$building_area) | property_data$building_area==0,'building_area'] = NA
#limit analysis to buildings with less than 1000 building_area
BA_bool <- is.na(property_data$building_area) | property_data$building_area<1000
#----------------------land_area
ggplot(property_data,aes(x=land_area,y=log(price))) + 
    geom_point(alpha=0.2,color='#0078D7')

#limit analysis to properties with land_area < 10000
land_bool <- is.na(property_data$land_area) | property_data$land_area < 10000
#---------------------actions

#gather bools of which rows can be kept, intersection will be kept
property_data <- property_data[nroom_bool & bath_bool & land_bool & BA_bool & ncar_bool &loc_bool,]

#replot
ggplot(property_data,aes(x=building_area,y=log(price))) + 
    geom_point(alpha=0.2,color='#0078D7')

ggplot(property_data,aes(x=land_area,y=log(price))) +
    geom_point(alpha=0.2,color='#0078D7')

save(property_data,file='property_data_from_Rmd_file.Robject')

Validation

The validation scheme is as follows:

This scheme has two main benefits over other popular methods:

  • It is fair. explain this

  • It used all of the training/ensemble validation data for the final model. explain this

Validation Scheme Diagram

Validation Scheme Diagram

Reconstruction

na_prop = sapply(property_data,function(x){mean(is.na(x))})
na_prop_df = data.frame('feature'=names(na_prop),'prop_na'=na_prop)
na_prop_df = na_prop_df[na_prop_df$prop_na>0,]
na_prop_df = na_prop_df[order(na_prop_df$prop_na,decreasing = T),]
na_prop_df$prop_na = round(na_prop_df$prop_na,2)

#write a 
na_prop_df  %>% kable(row.names=F,
                      col.names = c('Feature','Proportion of Objects with NA Values'))
Feature Proportion of Objects with NA Values
building_area 0.49
year_built 0.42
price 0.22
land_area 0.14
ncar 0.02

note recon after validation

include some plots with the red coloring

dont harp on

Importing data from OSM

Primary Models

consider including a pic of error/sample size to justify not imputing but point out i explored it

K-Nearest Neighbours

**acknowledge limitations of CV in that i chose feats before CV, not to worry, it wont

Gradient Boosted Trees

Linear methods

Ensembling

Discussion

Conclusion